In [1]:
import pandas as pd
import plotly.express as px
Load the CSV files¶
In [2]:
df_consumption = pd.read_csv(f"../data/water_consumption_by_second.csv", parse_dates=["second_bucket"])
df_consumption.sort_values("second_bucket", inplace=True)
df_consumption.head()
Out[2]:
| second_bucket | pulse_count | liters | inserted_at | |
|---|---|---|---|---|
| 0 | 2025-03-26 19:22:10+01:00 | 4.0 | 0.008264 | 2025-04-03 20:23:14.110 +0200 |
| 1 | 2025-03-26 19:22:11+01:00 | 0.0 | 0.000000 | 2025-04-03 20:23:14.110 +0200 |
| 2 | 2025-03-26 19:22:12+01:00 | 0.0 | 0.000000 | 2025-04-03 20:23:14.110 +0200 |
| 3 | 2025-03-26 19:22:13+01:00 | 0.0 | 0.000000 | 2025-04-03 20:23:14.110 +0200 |
| 4 | 2025-03-26 19:22:14+01:00 | 0.0 | 0.000000 | 2025-04-03 20:23:14.110 +0200 |
In [3]:
df_events = pd.read_csv("../data/event_labels.csv", parse_dates=["start_timestamp", "end_timestamp"])
df_events.head(20)
Out[3]:
| category | tag | start_timestamp | end_timestamp | |
|---|---|---|---|---|
| 0 | Grifo | NaN | 2025-03-26 19:23:29.825000+01:00 | 2025-03-26 19:23:52.072000+01:00 |
| 1 | Grifo | NaN | 2025-03-26 19:24:11.621000+01:00 | 2025-03-26 19:24:49.803000+01:00 |
| 2 | Grifo | NaN | 2025-03-26 19:25:20.283000+01:00 | 2025-03-26 19:25:39.161000+01:00 |
| 3 | Grifo | NaN | 2025-03-26 19:26:19.152000+01:00 | 2025-03-26 19:28:06.471000+01:00 |
| 4 | Grifo | NaN | 2025-03-26 19:49:45.658000+01:00 | 2025-03-26 19:50:20.381000+01:00 |
| 5 | Grifo | NaN | 2025-03-26 19:50:53.658000+01:00 | 2025-03-26 19:51:09.181000+01:00 |
| 6 | Grifo | NaN | 2025-03-26 19:51:23.135000+01:00 | 2025-03-26 19:51:35.214000+01:00 |
| 7 | Cisterna | #Mamá | 2025-03-26 19:52:02.819000+01:00 | 2025-03-26 19:52:26.086000+01:00 |
| 8 | Cisterna | #Hijo | 2025-03-26 19:53:22.564000+01:00 | 2025-03-26 19:53:55.714000+01:00 |
| 9 | Cisterna | #Hijo | 2025-03-26 19:54:00.104000+01:00 | 2025-03-26 19:55:10.170000+01:00 |
| 10 | Cisterna | #Papá | 2025-03-26 19:55:38.464000+01:00 | 2025-03-26 19:56:38.858000+01:00 |
| 11 | Cisterna | #Papá | 2025-03-26 19:56:42.738000+01:00 | 2025-03-26 19:57:41.952000+01:00 |
| 12 | Grifo | NaN | 2025-03-26 22:27:16.237000+01:00 | 2025-03-26 22:27:26.538000+01:00 |
| 13 | Ducha | #Mamá | 2025-03-27 06:06:47.967000+01:00 | 2025-03-27 06:11:19.173000+01:00 |
| 14 | Ducha | #Papá | 2025-03-27 07:30:32.701000+01:00 | 2025-03-27 07:34:38.051000+01:00 |
| 15 | Cisterna | #Hijo | 2025-03-27 18:56:45.567000+01:00 | 2025-03-27 18:57:12.991000+01:00 |
| 16 | Grifo | NaN | 2025-03-27 19:15:26.515000+01:00 | 2025-03-27 19:15:43.638000+01:00 |
| 17 | Lavadora | NaN | 2025-03-27 22:25:18.400000+01:00 | 2025-03-27 23:19:24.687000+01:00 |
| 18 | Cisterna | #Papá | 2025-03-27 22:49:30.124000+01:00 | 2025-03-27 22:50:26.727000+01:00 |
| 19 | Cisterna | #Hijo | 2025-03-27 23:54:25.349000+01:00 | 2025-03-27 23:56:16.424000+01:00 |
Calculate the categories and tags for each second bucket¶
In [4]:
# Create columns for categories and tags
df_consumption["category_list"] = [[] for _ in range(len(df_consumption))]
df_consumption["tag_list"] = [[] for _ in range(len(df_consumption))]
# Assign categories and tags
for _, event in df_events.iterrows():
mask = (df_consumption["second_bucket"] >= event["start_timestamp"]) & (df_consumption["second_bucket"] <= event["end_timestamp"])
df_consumption.loc[mask, "category_list"] = df_consumption.loc[mask, "category_list"].apply(
lambda x: x + [event["category"]]
)
df_consumption.loc[mask, "tag_list"] = df_consumption.loc[mask, "tag_list"].apply(
lambda x: x + [event["tag"]]
)
In [5]:
# Fill empty lists with 'Unknown'
df_consumption["category_list"] = df_consumption["category_list"].apply(lambda x: x if x else ["Unknown"])
df_consumption["tag_list"] = df_consumption["tag_list"].apply(lambda x: x if x else ["Unknown"])
# Explode the arrays
df_exploded = df_consumption.explode("category_list").explode("tag_list")
# Split liters evenly among labels
df_exploded["label_count"] = df_exploded.groupby("second_bucket")["category_list"].transform("count")
df_exploded["liters_split"] = df_exploded["liters"] / df_exploded["label_count"]
df_exploded.head()
Out[5]:
| second_bucket | pulse_count | liters | inserted_at | category_list | tag_list | label_count | liters_split | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2025-03-26 19:22:10+01:00 | 4.0 | 0.008264 | 2025-04-03 20:23:14.110 +0200 | Unknown | Unknown | 1 | 0.008264 |
| 1 | 2025-03-26 19:22:11+01:00 | 0.0 | 0.000000 | 2025-04-03 20:23:14.110 +0200 | Unknown | Unknown | 1 | 0.000000 |
| 2 | 2025-03-26 19:22:12+01:00 | 0.0 | 0.000000 | 2025-04-03 20:23:14.110 +0200 | Unknown | Unknown | 1 | 0.000000 |
| 3 | 2025-03-26 19:22:13+01:00 | 0.0 | 0.000000 | 2025-04-03 20:23:14.110 +0200 | Unknown | Unknown | 1 | 0.000000 |
| 4 | 2025-03-26 19:22:14+01:00 | 0.0 | 0.000000 | 2025-04-03 20:23:14.110 +0200 | Unknown | Unknown | 1 | 0.000000 |
Plot Consumption by Category¶
In [6]:
df_by_category = (
df_exploded.groupby("category_list", as_index=False)["liters_split"].sum()
.sort_values("liters_split", ascending=False)
)
total_liters_category = df_by_category["liters_split"].sum()
df_by_category["percent"] = 100 * df_by_category["liters_split"] / total_liters_category
df_by_category["label"] = df_by_category["liters_split"].round(1).astype(str) + " L (" + df_by_category["percent"].round(1).astype(str) + "%)"
fig_cat = px.bar(
df_by_category,
x="category_list",
y="liters_split",
color="category_list",
text="label",
title="Water Consumption by Category (Including Unknown)",
labels={"category_list": "Category", "liters_split": "Liters"},
)
fig_cat.update_layout(
xaxis_title="Category",
yaxis_title="Total Liters",
height=500,
uniformtext_minsize=8,
uniformtext_mode='show'
)
fig_cat.show()
Plot Consumption by Tag¶
In [7]:
df_by_tag = (
df_exploded.groupby("tag_list", as_index=False)["liters_split"].sum()
.sort_values("liters_split", ascending=False)
)
total_liters_tag = df_by_tag["liters_split"].sum()
df_by_tag["percent"] = 100 * df_by_tag["liters_split"] / total_liters_tag
df_by_tag["label"] = df_by_tag["liters_split"].round(1).astype(str) + " L (" + df_by_tag["percent"].round(1).astype(str) + "%)"
fig_tag = px.bar(
df_by_tag,
x="tag_list",
y="liters_split",
color="tag_list",
text="label",
title="Water Consumption by Tag (Including Unknown)",
labels={"tag_list": "Tag", "liters_split": "Liters"},
)
fig_tag.update_layout(
xaxis_title="Tag",
yaxis_title="Total Liters",
height=500,
uniformtext_minsize=8,
uniformtext_mode='show'
)
fig_tag.show()
In [ ]: